{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower bound on the number of exercises a student must have attempted.\n",
    "prob_count_limit=15\n",
    "\n",
    "# Folder holding the raw Junyi CSV files, and the folder that receives the processed output.\n",
    "read_dir='E:/文档/数据集/Data Sets used in Computing Education/junyi/'\n",
    "save_dir='../data/junyi/'\n",
    "\n",
    "# Load the raw problem log.\n",
    "problem_log_path=read_dir+'junyi_ProblemLog_original.csv'\n",
    "log=pd.read_csv(problem_log_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the three columns used by the rest of the notebook.\n",
    "data=log[['user_id','exercise','correct']]\n",
    "# One row per (user, exercise) pair: the first occurrence in the log wins.\n",
    "data=data.drop_duplicates(subset=['user_id','exercise'],keep='first')\n",
    "# Then discard rows that have a missing value in any kept column.\n",
    "data=data.dropna(axis=0,how='any')\n",
    "# Multiplying by 1 turns a boolean column into 0/1 integers\n",
    "# (assumes 'correct' is boolean in the CSV - TODO confirm).\n",
    "data=data.assign(correct=data['correct']*1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "有效的学生数： 36591\n"
     ]
    }
   ],
   "source": [
    "# Count, per student, the non-null values in each remaining column.\n",
    "problem_counter=data.groupby(by='user_id').count()\n",
    "\n",
    "# A student is kept only when the exercise count is strictly greater than this boundary.\n",
    "filtered_stu_boundary=prob_count_limit\n",
    "\n",
    "has_enough_exercises=problem_counter['exercise']>filtered_stu_boundary\n",
    "filtered_stu_id=problem_counter.index[has_enough_exercises.to_numpy()].to_numpy()\n",
    "\n",
    "print('有效的学生数：',len(filtered_stu_id))\n",
    "\n",
    "# Pull the kept students' rows (in filtered_stu_id order) and switch to the output column names.\n",
    "record=data.set_index('user_id').loc[filtered_stu_id,:].reset_index()\n",
    "record=record.rename(columns={'exercise':'item_id','correct':'score'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every exercise acts as its own knowledge concept, so both columns hold the same exercise id.\n",
    "# The table is built from the distinct, non-null item ids (first-appearance order) instead of\n",
    "# selecting the same column twice, which creates duplicate column labels and needs a second\n",
    "# de-duplication pass over the index.\n",
    "distinct_items=record['item_id'].dropna().drop_duplicates().reset_index(drop=True)\n",
    "item_conc_data=pd.DataFrame({'item_id':distinct_items,'knowledge_code':distinct_items})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# np.unique returns the sorted distinct values of each identifier column.\n",
    "item_unique,know_unique,stu_unique=(\n",
    "    np.unique(values)\n",
    "    for values in (record['item_id'],item_conc_data['knowledge_code'],record['user_id'])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every raw identifier to a consecutive integer id starting at 1.\n",
    "stu_old_new={old_id:new_id for new_id,old_id in enumerate(stu_unique,start=1)}\n",
    "item_old_new={old_id:new_id for new_id,old_id in enumerate(item_unique,start=1)}\n",
    "know_old_new={old_id:new_id for new_id,old_id in enumerate(know_unique,start=1)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output folder first: open() raises FileNotFoundError when save_dir does not exist.\n",
    "os.makedirs(save_dir,exist_ok=True)\n",
    "\n",
    "# Persist the raw-knowledge-name -> integer-id mapping as JSON.\n",
    "with open(save_dir+'dict_knowledge_code.json','w') as f:\n",
    "    json.dump(know_old_new,f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done\n"
     ]
    }
   ],
   "source": [
    "# Swap the raw student / exercise identifiers in the records for their new integer ids.\n",
    "# NOTE: the raw values are overwritten, so this cell is meant to run once per kernel session.\n",
    "record=record.assign(\n",
    "    user_id=record['user_id'].map(stu_old_new),\n",
    "    item_id=record['item_id'].map(item_old_new),\n",
    ")\n",
    "\n",
    "# Apply the same remapping to the item -> knowledge-concept table.\n",
    "item_conc_data=item_conc_data.assign(\n",
    "    item_id=item_conc_data['item_id'].map(item_old_new),\n",
    "    knowledge_code=item_conc_data['knowledge_code'].map(know_old_new),\n",
    ")\n",
    "\n",
    "print('Done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the item -> concept table by the remapped item id. The guard keeps this cell\n",
    "# re-runnable: once 'item_id' has become the index it is no longer a column.\n",
    "if 'item_id' in item_conc_data.columns:\n",
    "    item_conc_data=item_conc_data.set_index('item_id')\n",
    "\n",
    "# Build the frame with a single constructor call instead of filling it cell by cell via .loc,\n",
    "# so storing a list per row does not depend on pandas' setitem handling of list values.\n",
    "# Each knowledge_code entry is the list of codes for that item (converted with tolist());\n",
    "# an item id missing from item_conc_data raises KeyError.\n",
    "item_ids=list(range(1,len(item_unique)+1))\n",
    "item_df=pd.DataFrame(\n",
    "    {\n",
    "        'item_id':item_ids,\n",
    "        'knowledge_code':[\n",
    "            np.atleast_1d(item_conc_data.loc[item_id,'knowledge_code']).tolist()\n",
    "            for item_id in item_ids\n",
    "        ],\n",
    "    },\n",
    "    index=item_ids,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the item table and the interaction records without the pandas index column.\n",
    "for frame,file_name in ((item_df,'item.csv'),(record,'record.csv')):\n",
    "    frame.to_csv(save_dir+file_name,index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "学习者数： 36591\n",
      "习题数： 721\n",
      "知识点数： 721\n",
      "记录数： 1550016\n"
     ]
    }
   ],
   "source": [
    "# Report dataset sizes: learners, exercises, knowledge concepts, and interaction records.\n",
    "summary_counts=(\n",
    "    ('学习者数：',len(stu_unique)),\n",
    "    ('习题数：',len(item_unique)),\n",
    "    ('知识点数：',len(know_unique)),\n",
    "    ('记录数：',len(record)),\n",
    ")\n",
    "for label,count in summary_counts:\n",
    "    print(label,count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder with the exercise table; the same folder is used for the relationship annotation files.\n",
    "relation_data_dir='E:/文档/数据集/Data Sets used in Computing Education/junyi/'\n",
    "prob_data=pd.read_csv(relation_data_dir+'junyi_Exercise_table.csv')\n",
    "\n",
    "# Translate the exercise name and its prerequisite to knowledge ids. Rows where either side\n",
    "# is missing or has no id in know_old_new are dropped before the cast to int.\n",
    "prerequisites=pd.DataFrame({\n",
    "    'Exercise_A':prob_data['name'].map(know_old_new),\n",
    "    'Exercise_B':prob_data['prerequisites'].map(know_old_new),\n",
    "})\n",
    "prerequisites=prerequisites.dropna(how='any').astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "relation_test=pd.read_csv(relation_data_dir+'relationship_annotation_testing.csv')\n",
    "relation_train=pd.read_csv(relation_data_dir+'relationship_annotation_training.csv')\n",
    "\n",
    "# Stack both annotation files and keep the exercise pairs whose average similarity is at least 5.\n",
    "relation_data=pd.concat([relation_test,relation_train]).loc[:,['Exercise_A','Exercise_B','Similarity_avg']]\n",
    "is_similar=relation_data['Similarity_avg']>=5\n",
    "relation_data=relation_data.loc[is_similar,['Exercise_A','Exercise_B']]\n",
    "\n",
    "# Translate exercise names to knowledge ids; pairs with a side that has no id are dropped.\n",
    "relation_data=relation_data.assign(\n",
    "    Exercise_A=relation_data['Exercise_A'].map(know_old_new),\n",
    "    Exercise_B=relation_data['Exercise_B'].map(know_old_new),\n",
    ").dropna(how='any').astype('int')\n",
    "\n",
    "# Similarity is mutual, so also add every pair with its two sides swapped.\n",
    "relation_data_2=relation_data.rename(\n",
    "    columns={'Exercise_A':'Exercise_B','Exercise_B':'Exercise_A'}\n",
    ")[['Exercise_A','Exercise_B']]\n",
    "\n",
    "# Merge prerequisite and similarity pairs, drop repeated pairs, and use the output column names.\n",
    "relationship=pd.concat([prerequisites,relation_data,relation_data_2]).drop_duplicates(keep='first')\n",
    "relationship=relationship.rename(columns={'Exercise_A':'knowledge_code','Exercise_B':'parent'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the combined relationship table without the pandas index column.\n",
    "relationship_path=save_dir+'concept_relationship.csv'\n",
    "relationship.to_csv(relationship_path,index=False)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "527a93331b4b1a8345148922acc34427fb7591433d63b66d32040b6fbbc6d593"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('pytorch': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
